#install.packages("dplyr")
#install.packages("lubridate")
#install.packages("ggplot2")
#install.packages("corrplot")
#install.packages("PerformanceAnalytics")
#install.packages("randomForest")
#install.packages("caret")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.4.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(cluster)
# data is taken from https://www.kaggle.com/datasets/ethon0426/lending-club-20072020q1
data <- read.csv('Loan_status_2007-2020Q3.gzip')
head(data)
start_date = "2019-01-01"
end_date = "2019-12-31"
data <- data %>%
mutate(
issue_date_parsed = parse_date_time(issue_d, orders = "b-Y")
)
df_2019 <- data %>% filter(issue_date_parsed >= ymd(start_date) &
issue_date_parsed <= ymd(end_date))
rm(data)
head(df_2019)
Removal of columns where number na is bigger than threshold
threshold <- 0.5
missing_pct <- colMeans(is.na(df_2019))
df_clean_2019 <- df_2019[, missing_pct <= threshold]
df_clean_2019